In [1]:
# Computations
import numpy as np
import pandas as pd

# scipy
import scipy.stats as stats
from scipy.stats import norm

# sklearn
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, KFold
from sklearn.feature_selection import RFE
from sklearn import metrics
from sklearn.utils.fixes import loguniform

from sklearn.ensemble import RandomForestClassifier

# Text
from colorama import Fore, Back, Style
import re

# Visualisation libraries
import seaborn as sns

import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec

import missingno as msno

from plotly.offline import init_notebook_mode, iplot 
import plotly.graph_objs as go
import plotly.offline as py
from wordcloud import WordCloud
from plotly.subplots import make_subplots
import plotly.express as px

# Graphics in retina format 
%config InlineBackend.figure_format = 'retina' 

# sns setting
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
sns.set_style("whitegrid")

# plt setting
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

Telco Customer Churn (Classification)

In this article, we would like to predict customer churn for Telco Customer Churn data.

Table of Contents

Dataset

  • Customers who left within the last month – the column is called Churn
  • Services that each customer has signed up for – phone, multiple lines, internet, online security, online backup, device * protection, tech support, and streaming TV and movies
  • Customer account information – how long they’ve been a customer, contract, payment method, paperless billing, monthly charges, and total charges
  • Demographic info about customers – gender, age range, and if they have partners and dependents
Columns Description
customerID Customer ID
gender Whether the customer is a male or a female
SeniorCitizen Whether the customer is a senior citizen or not (1, 0)
Partner Whether the customer has a partner or not (Yes, No)
Dependents Whether the customer has dependents or not (Yes, No)
tenure Number of months the customer has stayed with the company
PhoneService Whether the customer has a phone service or not (Yes, No)
MultipleLines Whether the customer has multiple lines or not (Yes, No, No phone service)
InternetService Customer’s internet service provider (DSL, Fiber optic, No)
OnlineSecurity Whether the customer has online security or not (Yes, No, No internet service)
OnlineBackup Whether the customer has an online backup or not (Yes, No, No internet service)
DeviceProtection Whether the customer has device protection or not (Yes, No, No internet service)
TechSupport Whether the customer has tech support or not (Yes, No, No internet service)
StreamingTV Whether the customer has streaming TV or not (Yes, No, No internet service)
StreamingMovies Whether the customer has streaming movies or not (Yes, No, No internet service)
Contract The contract term of the customer (Month-to-month, One year, Two years)
PaperlessBilling Whether the customer has paperless billing or not (Yes, No)
PaymentMethod The customer’s payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))
MonthlyCharges The amount charged to the customer monthly
TotalCharges The total amount charged to the customer
Churn Whether the customer churned or not (Yes or No)
In [2]:
Data = pd.read_csv('telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')

def Data_info(Inp, Only_NaN = False):
    Out = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
    Out = Out.join(Inp.isnull().sum().to_frame(name = 'Number of NaN Values'), how='outer')
    Out['Percentage'] = np.round(100*(Out['Number of NaN Values']/Inp.shape[0]),2)
    if Only_NaN:
        Out = Out.loc[Out['Number of NaN Values']>0]
    return Out

def dtypes_group(Inp):
    Temp = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
    Out = pd.DataFrame(index =Temp['Data Type'].unique(), columns = ['Columns'])
    for c in Temp['Data Type'].unique():
        Out.loc[Out.index == c, 'Columns'] = [Temp.loc[Temp['Data Type'] == c].index.tolist()]
    return Out

_ = msno.bar(Data, figsize=(16,5), fontsize=14, log=False, color="#34495e")

def text_sep(txt): return re.sub(r"(\w)([A-Z])", r"\1 \2", txt)

def col_details(Col):
    print(Back.BLACK + Fore.CYAN + Style.NORMAL + '%s:' % text_sep(Col))
    print(Style.RESET_ALL)
    print('%s' % ', '.join(Data[Col].unique()))
    
Data.rename(columns = {'gender':'Gender', 'tenure':'Tenure'}, inplace = True)
Data.columns = [text_sep(txt) for txt in Data.columns.tolist()]

Initial Analysis

Churn Percentage by Gender

In [3]:
Feature = 'Gender'
Temp = Data.groupby([Feature,'Churn'])[Feature].agg({'count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)
display(Temp)
C = ['aquamarine', 'steelblue']
SC = 'Navy'
Temp.reset_index(drop = False, inplace = True)
fig = px.bar(Temp, y= 'Churn', x= 'Percentage', orientation='h',
             color = Feature, text = 'Percentage', color_discrete_sequence= C, height= 220)
fig.update_traces(marker_line_color= SC, marker_line_width=1.5, opacity=1)
fig.update_traces(texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 100])
fig.update_layout(title = 'Churn Percentage by Gender', plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()


fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])

fig.add_trace(go.Pie(labels= Temp.loc[Temp.Churn == 'Yes',Feature].values,
                     values= Temp.loc[Temp.Churn == 'Yes','count'].values,
                     name= 'Churn [Yes]', textfont=dict(size=16),
                     marker=dict(colors = C, line=dict(color='black', width=1))), 1, 1)

fig.add_trace(go.Pie(labels=Temp.loc[Temp.Churn == 'No', Feature].values,
                     values=Temp.loc[Temp.Churn == 'No','count'].values,
                     name= 'Churn [No]', textfont=dict(size=16),
                     marker=dict(colors = C, line=dict(color='black', width=1))), 1, 2)

fig.update_traces(hole=.6, marker_line_color= SC, marker_line_width=1, opacity=1)

fig.update_layout(legend_title=Feature, font=dict(size=14), legend=dict(orientation="v"),
                  annotations=[dict(text='Churned', x=0.18, y=0.5, font_size=16, showarrow=False),
                               dict(text='Not Churned', x=0.845, y=0.5, font_size=16, showarrow=False)], height = 400)
fig.show()
del Feature
count Percentage
Gender Churn
Female No 2549 36.19
Yes 939 13.33
Male No 2625 37.27
Yes 930 13.20

It can be seen that there is a balance between the two genders among churned customers.

Churn Percentage by Senior Citizen

In [4]:
Temp = Data.copy()
Temp['Senior Citizen'] = Temp['Senior Citizen'].map(lambda x: 'Yes' if x ==1 else 'No')
Feature = 'Senior Citizen'
Temp = Temp.groupby([Feature,'Churn'])[Feature].agg({'count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)
display(Temp)
C = ['greenyellow', 'seagreen']
SC = 'DarkGreen'
Temp.reset_index(drop = False, inplace = True)
fig = px.bar(Temp, y= 'Churn', x= 'Percentage', orientation='h',
             color = Feature, text = 'Percentage', color_discrete_sequence= C, height= 220)
fig.update_traces(marker_line_color=SC, marker_line_width=1.5, opacity=1)
fig.update_traces(texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 100])
fig.update_layout(title = 'Churn Percentage by Gender', plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()


fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])

fig.add_trace(go.Pie(labels= Temp.loc[Temp.Churn == 'Yes',Feature].values,
                     values= Temp.loc[Temp.Churn == 'Yes','count'].values,
                     name= 'Churn [Yes]', textfont=dict(size=16),
                     marker=dict(colors = C, line=dict(color='black', width=1))), 1, 1)

fig.add_trace(go.Pie(labels=Temp.loc[Temp.Churn == 'No', Feature].values,
                     values=Temp.loc[Temp.Churn == 'No','count'].values,
                     name= 'Churn [No]', textfont=dict(size=16),
                     marker=dict(colors = C, line=dict(color='black', width=1))), 1, 2)

fig.update_traces(hole=.6, marker_line_color=SC, marker_line_width=1, opacity=1)

fig.update_layout(legend_title=Feature, font=dict(size=14), legend=dict(orientation="v"),
                  annotations=[dict(text='Churned', x=0.18, y=0.5, font_size=16, showarrow=False),
                               dict(text='Not Churned', x=0.845, y=0.5, font_size=16, showarrow=False)], height = 400)
fig.show()
del Feature
count Percentage
Senior Citizen Churn
No No 4508 64.01
Yes 1393 19.78
Yes No 666 9.46
Yes 476 6.76

It can be seen that only 25.5% of the senior customers were churned.

Churn Percentage by Partner

In [5]:
Feature = 'Partner'
Temp = Data.groupby([Feature,'Churn'])[Feature].agg({'count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)

display(Temp)
C = ['bisque', 'orange']
SC = 'DarkOrange'
Temp.reset_index(drop = False, inplace = True)
fig = px.bar(Temp, y= 'Churn', x= 'Percentage', orientation='h',
             color = Feature, text = 'Percentage', color_discrete_sequence= C, height= 220)
fig.update_traces(marker_line_color=SC, marker_line_width=1.5, opacity=1)
fig.update_traces(texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 100])
fig.update_layout(title = 'Churn Percentage by Gender', plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()


fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])

fig.add_trace(go.Pie(labels= Temp.loc[Temp.Churn == 'Yes',Feature].values,
                     values= Temp.loc[Temp.Churn == 'Yes','count'].values,
                     name= 'Churn [Yes]', textfont=dict(size=16),
                     marker=dict(colors = C, line=dict(color='black', width=1))), 1, 1)

fig.add_trace(go.Pie(labels=Temp.loc[Temp.Churn == 'No', Feature].values,
                     values=Temp.loc[Temp.Churn == 'No','count'].values,
                     name= 'Churn [No]', textfont=dict(size=16),
                     marker=dict(colors = C, line=dict(color='black', width=1))), 1, 2)

fig.update_traces(hole=.6, marker_line_color=SC, marker_line_width=1, opacity=1)

fig.update_layout(legend_title=Feature, font=dict(size=14), legend=dict(orientation="v"),
                  annotations=[dict(text='Churned', x=0.18, y=0.5, font_size=16, showarrow=False),
                               dict(text='Not Churned', x=0.845, y=0.5, font_size=16, showarrow=False)], height = 400)
fig.show()
del Feature
count Percentage
Partner Churn
No No 2441 34.66
Yes 1200 17.04
Yes No 2733 38.80
Yes 669 9.50

Over 64% of churned customers did not have any partners.

Churn Percentage by Dependents

In [6]:
Feature = 'Dependents'
Temp = Data.groupby([Feature,'Churn'])[Feature].agg({'count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)

display(Temp)
C = ['pink', 'hotpink']
SC = 'DarkRed'
Temp.reset_index(drop = False, inplace = True)
fig = px.bar(Temp, y= 'Churn', x= 'Percentage', orientation='h',
             color = Feature, text = 'Percentage', color_discrete_sequence= C, height= 220)
fig.update_traces(marker_line_color=SC, marker_line_width=1.5, opacity=1)
fig.update_traces(texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 100])
fig.update_layout(title = 'Churn Percentage by Gender', plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()


fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])

fig.add_trace(go.Pie(labels= Temp.loc[Temp.Churn == 'Yes',Feature].values,
                     values= Temp.loc[Temp.Churn == 'Yes','count'].values,
                     name= 'Churn [Yes]', textfont=dict(size=16),
                     marker=dict(colors = C, line=dict(color='black', width=1))), 1, 1)

fig.add_trace(go.Pie(labels=Temp.loc[Temp.Churn == 'No', Feature].values,
                     values=Temp.loc[Temp.Churn == 'No','count'].values,
                     name= 'Churn [No]', textfont=dict(size=16),
                     marker=dict(colors = C, line=dict(color='black', width=1))), 1, 2)

fig.update_traces(hole=.6, marker_line_color=SC, marker_line_width=1, opacity=1)

fig.update_layout(legend_title=Feature, font=dict(size=14), legend=dict(orientation="v"),
                  annotations=[dict(text='Churned', x=0.18, y=0.5, font_size=16, showarrow=False),
                               dict(text='Not Churned', x=0.845, y=0.5, font_size=16, showarrow=False)], height = 400)
fig.show()
del Feature
count Percentage
Dependents Churn
No No 3390 48.13
Yes 1543 21.91
Yes No 1784 25.33
Yes 326 4.63

Over 82 percent of churned customers did not have any dependents.

Churn Percentage by Tenure

In [7]:
Feature = 'Tenure'
Temp = Data.groupby([Feature,'Churn'])[Feature].agg({'count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)
Temp.reset_index(drop = False, inplace = True)
fig = px.bar(Temp, y= 'Churn', x= 'Percentage', orientation='h',
             color = Feature, text = 'Percentage',color_continuous_scale= 'ylgn', height= 450)
fig.show()


C = ['violet', 'mediumorchid']
fig = px.bar(Temp, x= Feature, y= 'Percentage',
             color = 'Churn', text = 'Percentage', color_discrete_sequence= C, height= 500)

fig.update_traces(marker_line_color='Indigo', marker_line_width=1.2, opacity=1)
fig.update_traces(texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['yaxis'].update(range=[0, 10])
fig.update_layout(title = 'Churn Percentage by Gender', plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
del Feature

Customers with a higher tenure tend to churn less.

Churn Percentage by Contract

In [8]:
Feature = 'Contract'
Temp = Data.groupby([Feature,'Churn'])[Feature].agg({'count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)

display(Temp)
C = ['greenyellow', 'limeGreen','DarkGreen']
SC = 'DarkGreen'
Temp.reset_index(drop = False, inplace = True)
fig = px.bar(Temp, y= 'Churn', x= 'Percentage', orientation='h',
             color = Feature, text = 'Percentage', color_discrete_sequence= C, height= 240)
fig.update_traces(marker_line_color=SC, marker_line_width=1.5, opacity=1)
fig.update_traces(texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 100])
fig.update_layout(title = 'Churn Percentage by Gender', plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()


fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])

fig.add_trace(go.Pie(labels= Temp.loc[Temp.Churn == 'Yes',Feature].values,
                     values= Temp.loc[Temp.Churn == 'Yes','count'].values,
                     name= 'Churn [Yes]', textfont=dict(size=16),
                     marker=dict(colors = C, line=dict(color='black', width=1))), 1, 1)

fig.add_trace(go.Pie(labels=Temp.loc[Temp.Churn == 'No', Feature].values,
                     values=Temp.loc[Temp.Churn == 'No','count'].values,
                     name= 'Churn [No]', textfont=dict(size=16),
                     marker=dict(colors = C, line=dict(color='black', width=1))), 1, 2)

fig.update_traces(hole=.6, marker_line_color= SC, marker_line_width=1, opacity=1)

fig.update_layout(legend_title=Feature, font=dict(size=14), legend=dict(orientation="v"),
                  annotations=[dict(text='Churned', x=0.18, y=0.5, font_size=16, showarrow=False),
                               dict(text='Not Churned', x=0.845, y=0.5, font_size=16, showarrow=False)], height = 400)
fig.show()
del Feature
count Percentage
Contract Churn
Month-to-month No 2220 31.52
Yes 1655 23.50
One year No 1307 18.56
Yes 166 2.36
Two year No 1647 23.38
Yes 48 0.68

The majority of churned customers were on a month-to-month base contract.

Churn Percentage by Payment Method

In [9]:
Feature = 'Payment Method'
Temp = Data.groupby([Feature,'Churn'])[Feature].agg({'count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)

display(Temp)
C = ['azure','paleturquoise','steelblue','MidnightBlue']
SC = 'Navy'
Temp.reset_index(drop = False, inplace = True)
fig = px.bar(Temp, y= 'Churn', x= 'Percentage', orientation='h',
             color = Feature, text = 'Percentage', color_discrete_sequence= C, height= 260)
fig.update_traces(marker_line_color=SC, marker_line_width=1.5, opacity=1)
fig.update_traces(texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 100])
fig.update_layout(title = 'Churn Percentage by Gender', plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()


fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])

fig.add_trace(go.Pie(labels= Temp.loc[Temp.Churn == 'Yes',Feature].values,
                     values= Temp.loc[Temp.Churn == 'Yes','count'].values,
                     name= 'Churn [Yes]', textfont=dict(size=16),
                     marker=dict(colors = C, line=dict(color='black', width=1))), 1, 1)

fig.add_trace(go.Pie(labels=Temp.loc[Temp.Churn == 'No', Feature].values,
                     values=Temp.loc[Temp.Churn == 'No','count'].values,
                     name= 'Churn [No]', textfont=dict(size=16),
                     marker=dict(colors = C, line=dict(color='black', width=1))), 1, 2)

fig.update_traces(hole=.6, marker_line_color= SC, marker_line_width=1, opacity=1)

fig.update_layout(legend_title=Feature, font=dict(size=14), legend=dict(orientation="v"),
                  annotations=[dict(text='Churned', x=0.18, y=0.5, font_size=16, showarrow=False),
                               dict(text='Not Churned', x=0.85, y=0.5, font_size=16, showarrow=False)], height = 400)
fig.show()
del Feature
count Percentage
Payment Method Churn
Bank transfer (automatic) No 1286 18.26
Yes 258 3.66
Credit card (automatic) No 1290 18.32
Yes 232 3.29
Electronic check No 1294 18.37
Yes 1071 15.21
Mailed check No 1304 18.51
Yes 308 4.37

Customers with an automatic payment method churned less.

Preprocessing

Int Columns

In [10]:
Data_types = dtypes_group(Data)
display(Data_types)
Temp = Data_types.loc[Data_types.index == 'int64'].values[0,0]
Data[Temp] = Data[Temp].astype(int)
del Temp
Columns
int64 [Senior Citizen, Tenure]
float64 [Monthly Charges]
object [customer ID, Payment Method, Paperless Billin...

Float Columns

In [11]:
Temp = Data_types.loc[Data_types.index == 'float64'].values[0,0]
Data[Temp] = Data[Temp].astype(float)
del Temp
Data['Total Charges'] = pd.to_numeric(Data['Total Charges'], errors='coerce')

Yes/No Columns

First, let's convert all Yes/No columns using as follows

\begin{cases} 0 &\mbox{No}\\ 1 &\mbox{Yes}\end{cases}
In [12]:
Temp = []
for i in Data_types.loc[Data_types.index == 'object'].values[0,0]:
    if set(Data[i].unique().tolist()) == {'No', 'Yes'}:
        Temp.append(i)
        
Data[Temp] = Data[Temp].replace({'Yes':1, 'No':0}).astype(int)
del Temp

However, some other columns can be converted similarly; however, we need to create a new feature.

In [13]:
Temp = []
for i in Data_types.loc[Data_types.index == 'object'].values[0,0]:
    if set(Data[i].unique().tolist()) == {'No', 'No internet service', 'Yes'}:
        Temp.append(i)
        
print('Columns: %s' %', '.join(Temp))
Columns: Streaming Movies, Streaming TV, Tech Support, Device Protection, Online Backup, Online Security

Note that,

In [14]:
col_details('Internet Service')
Internet Service:

DSL, Fiber optic, No

This Column can be coded as follows

$$\mbox{InternetServiceType} = \begin{cases} 0 &\mbox{No} \\ 1 &\mbox{DSL}\\ 2 &\mbox{Fiber optic}\end{cases}$$
In [15]:
def myfun(x):
    if x == 'No':
        return 0
    elif x == 'DSL':
        return 1
    else:
        return 2

Data['Internet Service'] = Data['Internet Service'].apply(lambda x: myfun(x)).astype(int)
del myfun

Since we have already included No interent service in InternetService, we can code the rest as,

\begin{cases} 0 &\mbox{No, No internet service}\\ 1 &\mbox{Yes}\end{cases}
In [16]:
Data[Temp] = Data[Temp].applymap(lambda x: 1 if x =='Yes' else 0).astype(int)

Since, there is already a feature as PhoneService, for MultipleLines, we can try $$ \mbox{MultipleLines} = \begin{cases} 0 &\mbox{No, No phone service}\\ 1 &\mbox{Yes}\end{cases} $$

In [17]:
Data['Multiple Lines'] = Data['Multiple Lines'].map(lambda x: 1 if x =='Yes' else 0).astype(int)

Other Columns

In [18]:
Data_types = dtypes_group(Data)
Temp = Data_types.loc[Data_types.index == 'object'].values[0,0]
print('Columns: %s' %', '.join(Temp))
Columns: Contract, Gender, Payment Method, customer ID

Contract

In [19]:
col_details('Contract')
Contract:

Month-to-month, One year, Two year
\begin{cases} 0 &\mbox{Month-to-month}\\ 1 &\mbox{One year}\\ 2 &\mbox{Two year} \end{cases}
In [20]:
Data['Contract'] = Data['Contract'].replace({'Month-to-month':0, 'One year':1, 'Two year':2}).astype(int)

Gender

$$ \mbox{Gender} = \begin{cases} 0 &\mbox{Female}\\ 1 &\mbox{Male}\end{cases} $$
In [21]:
Data['Gender'] = Data['Gender'].map(lambda x: 1 if x =='Male' else 0).astype(int)

PaymentMethod

In [22]:
col_details('Payment Method')
Payment Method:

Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic)

In this case, we can not rank these values. Therefore,

In [23]:
Data = Data.join(pd.get_dummies(Data['Payment Method']).astype(int))
Data = Data.drop(columns = ['Payment Method'])
Data_types = dtypes_group(Data)
display(Data_types)
Columns
int32 [Device Protection, Credit card (automatic), B...
float64 [Monthly Charges, Total Charges]
object [customer ID]

Data Correlations

Let's take a look at the variance of the features.

In [24]:
display(Data.drop(columns = ['Churn']).var().sort_values(ascending = False).to_frame(name= 'Variance')\
       .style.background_gradient(cmap=sns.light_palette("green", as_cmap=True)).set_precision(2))
Variance
Total Charges 5138252.41
Monthly Charges 905.41
Tenure 603.17
Contract 0.70
Internet Service 0.61
Gender 0.25
Partner 0.25
Multiple Lines 0.24
Paperless Billing 0.24
Streaming Movies 0.24
Streaming TV 0.24
Online Backup 0.23
Device Protection 0.23
Electronic check 0.22
Dependents 0.21
Tech Support 0.21
Online Security 0.20
Mailed check 0.18
Bank transfer (automatic) 0.17
Credit card (automatic) 0.17
Senior Citizen 0.14
Phone Service 0.09
In [25]:
def Correlation_Plot (Df,Fig_Size):
    Correlation_Matrix = Df.corr().round(2)
    mask = np.zeros_like(Correlation_Matrix)
    mask[np.triu_indices_from(mask)] = True
    for i in range(len(mask)):
        mask[i,i]=0
    Fig, ax = plt.subplots(figsize=(Fig_Size,Fig_Size))
    sns.heatmap(Correlation_Matrix, ax=ax, mask=mask, annot=True, square=True, 
                cmap =sns.color_palette("Greens", n_colors=10), linewidths = 0.2, vmin=0, vmax=1, cbar_kws={"shrink": .6})

Correlation_Plot (Data, 16)

Correlations of features with customer Churn.

In [26]:
Temp = Data.corr().round(2)
Temp['Churn'].sort_values().to_frame(name= 'Correlation')[:-1].style.background_gradient(cmap='RdYlGn',
                                                                                 subset=['Correlation']).set_precision(2)
Out[26]:
Correlation
Contract -0.40
Tenure -0.35
Total Charges -0.20
Online Security -0.17
Tech Support -0.16
Dependents -0.16
Partner -0.15
Credit card (automatic) -0.13
Bank transfer (automatic) -0.12
Mailed check -0.09
Online Backup -0.08
Device Protection -0.07
Gender -0.01
Phone Service 0.01
Multiple Lines 0.04
Streaming TV 0.06
Streaming Movies 0.06
Senior Citizen 0.15
Paperless Billing 0.19
Monthly Charges 0.19
Electronic check 0.30
Internet Service 0.32
In [27]:
fig, ax = plt.subplots(nrows=1, ncols=3, figsize = (16, 6))
Temp = ['Tenure','Monthly Charges','Total Charges']

for i in range(len(Temp)):
    _ = sns.distplot(Data[Temp[i]], 
                     fit=norm, kde=False, color='seagreen', ax= ax[i])

Modeling

In [28]:
df = Data.drop(columns = ['customer ID'])
df = df.fillna(df.mean())
Target = 'Churn'
X = df.drop(columns = [Target])
y = df[Target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
pd.DataFrame(data={'Set':['X_train','X_test','y_train','y_test'],
               'Shape':[X_train.shape, X_test.shape, y_train.shape, y_test.shape]}).set_index('Set').T
Out[28]:
Set X_train X_test y_train y_test
Shape (4930, 22) (2113, 22) (4930,) (2113,)

Furthermore, we would like to standardize features by removing the mean and scaling to unit variance. In this article, we demonstrated the benefits of scaling data using StandardScaler().

In [29]:
scaler = StandardScaler()

X_train_STD = scaler.fit_transform(X_train)
X_test_STD = scaler.transform(X_test)

X_train_STD = pd.DataFrame(data = X_train_STD, columns = X_train.columns)
X_test_STD = pd.DataFrame(data = X_test_STD, columns = X_test.columns)

A number of functions that we would use.

In [30]:
def Performance(clf, X_test = X_test_STD):
    df = pd.DataFrame()
    y_pred = clf.predict(X_test)
    df = df.append({'Score': clf.score(X_test, y_test),
                    'F1 Score': f1_score(y_test.values, y_pred, average= 'weighted'),
                    'Precision Score': precision_score(y_test.values, y_pred, average= 'weighted'),
                    'Recall Score':  recall_score(y_test.values, y_pred, average= 'weighted')}, ignore_index=True)
    display(df.style.hide_index())

def highlight_max(s):
    is_max = s == s.max()
    return ['background-color: SpringGreen' if v else '' for v in is_max]


def Feature_Ranking(clf):
    df = pd.DataFrame()
    for n in range(2, X.shape[1]):
        selector = RFE(estimator= clf, n_features_to_select=n, verbose=0)
        selector.fit(X_train_STD, y_train)
        df = df.append({'Number of Features to Select': n,
                        'Score':metrics.accuracy_score(y_test, selector.predict(X_test_STD)),
                        'Features': X.columns[selector.support_].tolist(),
                        'Best Features':X.columns[selector.ranking_ == 1].tolist()}, ignore_index=True)

    df = df[['Number of Features to Select', 'Score', 'Features', 'Best Features']]
    display(df.style.apply(highlight_max, subset=['Score']))
    return df.loc[df.Score == df.Score.max(), 'Features'].values[0]

def ROC_Curve(clf, X_test = X_test_STD):
    # false positive rates, true positive rates and thresholds
    fpr, tpr, threshold = metrics.roc_curve(y_test, clf.predict_proba(X_test)[:,1])

    fig, ax = plt.subplots(1, 1, figsize=(5.5, 5.5))
    _ = ax.plot(fpr, tpr, lw=2, label = 'AUC = %0.2f' % metrics.auc(fpr, tpr))
    _ = ax.plot([0, 1], [0, 1],'r--', lw=2)
    _ = ax.legend(loc = 'lower right', fontsize = 14)
    _ = ax.set_xlim([0,1])
    # _ = ax.set_ylim([0,1])
    _ = ax.set_xlabel('False Positive Rate (FPR)')
    _ = ax.set_ylabel('True Positive Rate (TPR)')

For this set of data, we would like to implement Random Forest Classifier. which creates a set of decision trees from a randomly selected subset of the training set. It benefits from voting of different decision trees for classifying the final outputs.

In [31]:
rfc = RandomForestClassifier()
_ = rfc.fit(X_train_STD,y_train)
Performance(rfc)
ROC_Curve(rfc)
F1 Score Precision Score Recall Score Score
0.780087 0.779921 0.792239 0.792239

However, we only need to implement features that are useful for classifications. In this article, we demonstrated the importance of feature ranking.

In [32]:
Best_Features = Feature_Ranking(rfc)
Number of Features to Select Score Features Best Features
0 2.000000 0.771415 ['Monthly Charges', 'Total Charges'] ['Monthly Charges', 'Total Charges']
1 3.000000 0.769522 ['Tenure', 'Monthly Charges', 'Total Charges'] ['Tenure', 'Monthly Charges', 'Total Charges']
2 4.000000 0.769995 ['Tenure', 'Contract', 'Monthly Charges', 'Total Charges'] ['Tenure', 'Contract', 'Monthly Charges', 'Total Charges']
3 5.000000 0.778987 ['Tenure', 'Internet Service', 'Contract', 'Monthly Charges', 'Total Charges'] ['Tenure', 'Internet Service', 'Contract', 'Monthly Charges', 'Total Charges']
4 6.000000 0.776621 ['Tenure', 'Internet Service', 'Contract', 'Monthly Charges', 'Total Charges', 'Electronic check'] ['Tenure', 'Internet Service', 'Contract', 'Monthly Charges', 'Total Charges', 'Electronic check']
5 7.000000 0.776621 ['Tenure', 'Internet Service', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Electronic check'] ['Tenure', 'Internet Service', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Electronic check']
6 8.000000 0.777567 ['Gender', 'Tenure', 'Internet Service', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Electronic check'] ['Gender', 'Tenure', 'Internet Service', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Electronic check']
7 9.000000 0.783247 ['Gender', 'Partner', 'Tenure', 'Internet Service', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Electronic check'] ['Gender', 'Partner', 'Tenure', 'Internet Service', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Electronic check']
8 10.000000 0.782773 ['Gender', 'Partner', 'Tenure', 'Internet Service', 'Online Security', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Electronic check'] ['Gender', 'Partner', 'Tenure', 'Internet Service', 'Online Security', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Electronic check']
9 11.000000 0.792712 ['Gender', 'Partner', 'Tenure', 'Internet Service', 'Online Security', 'Online Backup', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Electronic check'] ['Gender', 'Partner', 'Tenure', 'Internet Service', 'Online Security', 'Online Backup', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Electronic check']
10 12.000000 0.790345 ['Gender', 'Partner', 'Tenure', 'Internet Service', 'Online Security', 'Online Backup', 'Tech Support', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Electronic check'] ['Gender', 'Partner', 'Tenure', 'Internet Service', 'Online Security', 'Online Backup', 'Tech Support', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Electronic check']
11 13.000000 0.793185 ['Gender', 'Partner', 'Tenure', 'Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup', 'Tech Support', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Electronic check'] ['Gender', 'Partner', 'Tenure', 'Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup', 'Tech Support', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Electronic check']
12 14.000000 0.792239 ['Gender', 'Senior Citizen', 'Partner', 'Tenure', 'Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup', 'Tech Support', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Electronic check'] ['Gender', 'Senior Citizen', 'Partner', 'Tenure', 'Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup', 'Tech Support', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Electronic check']
13 15.000000 0.788452 ['Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Tenure', 'Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup', 'Tech Support', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Electronic check'] ['Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Tenure', 'Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup', 'Tech Support', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Electronic check']
14 16.000000 0.794132 ['Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Tenure', 'Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Electronic check'] ['Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Tenure', 'Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Electronic check']
15 17.000000 0.794605 ['Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Tenure', 'Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 'Streaming Movies', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Electronic check'] ['Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Tenure', 'Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 'Streaming Movies', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Electronic check']
16 18.000000 0.793658 ['Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Tenure', 'Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV', 'Streaming Movies', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Electronic check'] ['Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Tenure', 'Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV', 'Streaming Movies', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Electronic check']
17 19.000000 0.797918 ['Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Tenure', 'Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV', 'Streaming Movies', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Credit card (automatic)', 'Electronic check'] ['Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Tenure', 'Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV', 'Streaming Movies', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Credit card (automatic)', 'Electronic check']
18 20.000000 0.797444 ['Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Tenure', 'Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV', 'Streaming Movies', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Credit card (automatic)', 'Electronic check', 'Mailed check'] ['Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Tenure', 'Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV', 'Streaming Movies', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Credit card (automatic)', 'Electronic check', 'Mailed check']
19 21.000000 0.795551 ['Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Tenure', 'Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV', 'Streaming Movies', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Bank transfer (automatic)', 'Credit card (automatic)', 'Electronic check', 'Mailed check'] ['Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Tenure', 'Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV', 'Streaming Movies', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Bank transfer (automatic)', 'Credit card (automatic)', 'Electronic check', 'Mailed check']

Thus, the best features:

In [33]:
print(Best_Features)
['Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Tenure', 'Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV', 'Streaming Movies', 'Contract', 'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Credit card (automatic)', 'Electronic check']
In [34]:
rfc = RandomForestClassifier()
_ = rfc.fit(X_train_STD[Best_Features],y_train)
Performance(rfc, X_test_STD[Best_Features])
ROC_Curve(rfc, X_test_STD[Best_Features])
F1 Score Precision Score Recall Score Score
0.783356 0.784129 0.796025 0.796025

The main classification metrics:

In [35]:
pd.DataFrame(classification_report(y_test,rfc.predict(X_test[Best_Features]),
                                   output_dict = True, target_names = ['No_Churn','Churn']))
Out[35]:
No_Churn Churn accuracy macro avg weighted avg
precision 0.728945 0.297872 0.719356 0.513409 0.611843
recall 0.978558 0.024390 0.719356 0.501474 0.719356
f1-score 0.835506 0.045089 0.719356 0.440297 0.620788
support 1539.000000 574.000000 0.719356 2113.000000 2113.000000

A confusion matrix allows the visualization of the performance of a classification model.

In [36]:
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
_ = plot_confusion_matrix(rfc, X_test_STD[Best_Features], y_test, display_labels= ['No_Churn','Churn'],
                      cmap= "Greens", normalize= 'true', ax = ax)
_ = ax.set_title('Normalized Confusion Matrix')

Final Thoughts

Finally, customer churn can be predicted using our model for the test data.

In [37]:
display(pd.DataFrame(list(rfc.predict_proba(X_test_STD[Best_Features].values)), columns=['No_Churn','Churn']))
No_Churn Churn
0 0.39 0.61
1 0.88 0.12
2 1.00 0.00
3 0.39 0.61
4 1.00 0.00
... ... ...
2108 0.09 0.91
2109 0.44 0.56
2110 0.95 0.05
2111 0.73 0.27
2112 0.85 0.15

2113 rows × 2 columns